| SOCR ≫ | DSPA ≫ | DSPA2 Topics ≫ |
total_cols: 22940L total rows: 502493
# Peek at the header row only: counts columns without loading the full
# (very large) UKBB extract into memory.
initial_read <- fread("ukbb_data/ukb44534_compiled_tab-001.csv", nrows = 1)
total_cols <- ncol(initial_read)
total_rows <- as.numeric(system("wc -l < ukbb_data/ukb44534_compiled_tab-001.csv", intern=TRUE))

This code block sets up parallel processing capabilities, defines file paths, and initializes progress tracking. It includes a function to process each column of the dataset, determining the data type and saving summaries.
## Loading required package: iterators
## Loading required package: parallel
# Register a parallel back-end, leaving one core free for the OS.
cores <- detectCores() - 1
registerDoParallel(cores)

# Input file, progress log, and the number of columns to process.
data_path <- "ukbb_data/1000_rows.csv"
progress_path <- "progress_log.csv"
total_cols <- 29L

# Ensure the output directories exist (idempotent on re-runs).
plot_dir <- "plots/"
if (!dir.exists(plot_dir)) {
  dir.create(plot_dir, recursive = TRUE)
}
summary_dir <- "summaries/"
if (!dir.exists(summary_dir)) {
  dir.create(summary_dir, recursive = TRUE)
}

# Resume from a previous run when a progress log is present; otherwise
# create an empty log and start at column 2 (column 1 is presumably the
# participant ID — confirm against the data dictionary).
if (file.exists(progress_path)) {
  progress <- fread(progress_path)
  start_col <- max(progress$col_processed) + 1
} else {
  fwrite(data.table(col_processed = integer(0)), progress_path)
  start_col <- 2
}
process_column <- function(col) {
  # Classify one CSV column by inspecting a small sample of its values,
  # write a frequency summary for categorical columns, append to the
  # progress log, and return the column's name and detected type.
  #
  # col: 1-based column index into the file at `data_path`.
  # Returns: list(colname = <character>, type = <character>), where type is
  #   one of "Empty", "Binary", "Numeric", "Date", "Categorical".
  column_titles <- fread(data_path, nrows = 1)
  col_title <- names(column_titles)[col]
  current_col_sample <- fread(data_path, select = col, nrows = 30)
  sample_values <- na.omit(current_col_sample[[1]])
  if (length(sample_values) == 0) {
    detected_type <- "Empty"
  } else if (is.numeric(sample_values)) {
    # is.numeric() already returns a single logical for the whole vector;
    # the original all(is.numeric(...)) wrapper was redundant.
    if (all(sample_values %in% c(0, 1))) {
      detected_type <- "Binary"
    } else {
      detected_type <- "Numeric"
    }
  } else if (any(grepl("^\\d{4}-\\d{2}-\\d{2}$", sample_values))) {
    detected_type <- "Date"
  } else {
    detected_type <- "Categorical"
    # For categoricals, read the full column and persist a frequency table.
    current_col <- fread(data_path, select = col)
    freq <- as.data.frame(table(current_col[[1]], useNA = "ifany"))
    freq$Prop <- freq$Freq / sum(freq$Freq)
    fwrite(freq, paste0("summaries/", col_title, ".csv"))
  }
  # NOTE(review): appending from multiple %dopar% workers can interleave
  # writes to the shared progress log — confirm this is acceptable, or use
  # one log file per worker.
  fwrite(data.table(col_processed = col), progress_path, append = TRUE)
  list(colname = col_title, type = detected_type)
}
# Classify every remaining column in parallel and row-bind the results.
results <- foreach(col = start_col:total_cols,
                   .combine = "rbind",
                   .packages = "data.table") %dopar% process_column(col)
results <- as.data.table(results)
fwrite(results, "column_types_summary.csv")
print(results)
## colname
## 1: peak_expiratory_flow_pef_f3064_2_1
## 2: triplet_played_left_f4229_0_9
## 3: triplet_entered_left_f4236_0_8
## 4: target_number_to_be_entered_f4252_1_1
## 5: number_entered_by_participant_f4258_0_12
## 6: visual_acuity_result_in_round_left_f5082_1_8
## 7: ecg_stage_name_f5988_1_27
## 8: total_peripheral_resistance_during_pwa_f12685_2_0
## 9: operation_code_f20004_2_4
## 10: method_of_recording_time_when_noncancer_illness_first_diagnosed_f20013_2_11
## 11: workplace_full_of_chemical_or_other_fumes_f22610_0_24
## 12: breathing_problems_during_period_of_job_f22616_0_19
## 13: values_wanted_f23321_2_78
## 14: values_wanted_f23321_2_104
## 15: values_wanted_f23321_3_16
## 16: mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## 17: mean_icvf_in_anterior_corona_radiata_on_fa_skeleton_right_f25366_3_0
## 18: mean_od_in_superior_corona_radiata_on_fa_skeleton_left_f25417_3_0
## 19: volume_of_molecularlayerhphead_left_hemisphere_f26629_3_0
## 20: mean_thickness_of_precentral_right_hemisphere_f27288_3_0
## 21: volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## 22: urea_reportability_f30676_1_0
## 23: histology_of_cancer_tumour_f40011_16_0
## 24: external_ca_f41201_0_6
## 25: main_speciality_of_consultant_polymorphic_f41207_0_11
## 26: date_of_first_inpatient_diagnosis_main_icd10_f41262_0_33
## 27: date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## 28: typical_diet_yesterday_f100020_4_0
## colname
## type
## 1: Numeric
## 2: Numeric
## 3: Numeric
## 4: Empty
## 5: Empty
## 6: Empty
## 7: Categorical
## 8: Numeric
## 9: Empty
## 10: Empty
## 11: Empty
## 12: Empty
## 13: Empty
## 14: Empty
## 15: Numeric
## 16: Numeric
## 17: Empty
## 18: Empty
## 19: Empty
## 20: Empty
## 21: Numeric
## 22: Binary
## 23: Empty
## 24: Empty
## 25: Empty
## 26: Empty
## 27: Date
## 28: Binary
## type
This code block sets up for data visualization by reading a summary file and preparing a pie chart to visualize the distribution of data types.
# Read the per-column type summary back in and visualize the distribution
# of detected data types as a pie chart.
file_path <- "column_types_summary.csv"
data <- fread(file_path)
type_counts <- table(data$type)
type_counts_df <- as.data.frame(type_counts)
names(type_counts_df) <- c("Type", "Frequency")
type_counts_df$Prop <- type_counts_df$Frequency / sum(type_counts_df$Frequency)

# A pie chart with a single slice is uninformative, so require >= 2 types.
if (nrow(type_counts_df) > 1) {
  pie_colors <- rainbow(nrow(type_counts_df))
  png(file = "type_distribution_pie_chart.png")
  pie(
    type_counts_df$Frequency,
    labels = paste0(type_counts_df$Type, ": ",
                    round(type_counts_df$Prop * 100, 1), "%"),
    col = pie_colors,
    main = "Pie Chart of Data Types"
  )
  dev.off()
} else {
  cat("Not enough data to plot a pie chart.\n")
}
## quartz_off_screen
## 2
Read subset for toy analysis
# Read the 1000-row subset; treat empty strings and "NA"/"na" as missing.
data <- fread('ukbb_data/1000_rows.csv', na.strings = c("", "NA", "na"))
# Convert date columns to numeric (project helper defined elsewhere).
# NOTE(review): the warnings below indicate some columns matched no date
# format at all — confirm the upstream date-detection logic.
data <- convert_dates_to_numeric(data)## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
# Integer-encode categorical columns (project helper; encoding scheme not
# visible here — confirm).
data <- apply_label_encoding(data)
# Scale columns (project helper; presumably min-max given the [0, 1] cluster
# means printed later — confirm).
data <- normalize_df(data)
# Drop columns that are entirely NA (missForest cannot impute those).
data <- data %>% select(which(colSums(is.na(data)) != nrow(data)) %>% names())
# Random-forest imputation of the remaining missing values.
data <- missForest(data, verbose = TRUE)$ximp## missForest iteration 1 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.04060135
## time: 0.026 seconds
##
## missForest iteration 2 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.02085736
## time: 0.024 seconds
##
## missForest iteration 3 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.01624264
## time: 0.027 seconds
##
## missForest iteration 4 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.01359888
## time: 0.029 seconds
##
## missForest iteration 5 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.008524644
## time: 0.024 seconds
##
## missForest iteration 6 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.01154912
## time: 0.023 seconds
This section performs a K-means clustering analysis. It identifies the optimal number of clusters using silhouette scores and shows a visualization of the average silhouette score versus the number of clusters k.
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2) # for plotting
set.seed(123) # reproducible kmeans starts

# Search k = 2..30 for the clustering with the highest average silhouette
# width, tracking the score at every k for the diagnostic plot below.
optimal_k <- 2
highest_silhouette <- 0
sil_scores <- numeric(29) # average silhouette score for each k in 2..30
# Hoisted out of the loop: the distance matrix is invariant across k, and
# the original recomputed this O(n^2) object on every iteration.
dist_matrix <- dist(data_matrix)
for (k in 2:30) {
  km.res <- kmeans(data_matrix, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist_matrix)
  avg.sil <- mean(ss[, 3]) # column 3 of a silhouette object is the width
  sil_scores[k - 1] <- avg.sil
  if (avg.sil > highest_silhouette) {
    highest_silhouette <- avg.sil
    optimal_k <- k
  }
}
# Refit at the winning k so downstream chunks can use `clusters`.
clusters <- kmeans(data_matrix, centers = optimal_k, nstart = 25)
cat("Optimal number of clusters:", optimal_k, "\n")
## Optimal number of clusters: 3
## K-means clustering with 3 clusters of sizes 4, 34, 25
##
## Cluster means:
## number_of_operations_selfreported_f136_0_0 peak_expiratory_flow_pef_f3064_2_1
## 1 0.0937500 0.5172072
## 2 0.2205882 0.7152751
## 3 0.2200000 0.5399941
## triplet_played_left_f4229_0_9 triplet_entered_left_f4236_0_8
## 1 0.7192982 0.1721622
## 2 0.4419424 0.2673173
## 3 0.4532129 0.6596246
## ecg_stage_name_f5988_1_27 total_peripheral_resistance_during_pwa_f12685_2_0
## 1 0.2425000 0.9285518
## 2 0.1523529 0.9095128
## 3 0.5964000 0.5586293
## values_wanted_f23321_3_16
## 1 0
## 2 0
## 3 0
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## 1 0.7160994
## 2 0.7359227
## 3 0.4031346
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## 1 0.4573078
## 2 0.5419368
## 3 0.2915750
## urea_reportability_f30676_1_0
## 1 0
## 2 0
## 3 0
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## 1 0.6734667
## 2 0.7579405
## 3 0.2110482
## typical_diet_yesterday_f100020_4_0
## 1 0.0000000
## 2 0.9211765
## 3 0.8848000
##
## Clustering vector:
## [1] 3 3 2 2 2 3 2 3 2 2 2 2 1 2 3 2 2 2 1 2 2 3 1 2 2 2 2 2 2 3 3 2 2 3 3 3 3 2
## [39] 2 3 2 3 2 2 2 3 3 1 3 3 3 2 3 2 3 3 2 3 3 2 2 3 2
##
## Within cluster sum of squares by cluster:
## [1] 1.323683 6.254919 7.267960
## (between_SS / total_SS = 54.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Diagnostic plot: average silhouette score as a function of k.
k_values <- 2:30
sil_plot <- ggplot(
  data = data.frame(k_values, sil_scores),
  aes(x = k_values, y = sil_scores)
) +
  geom_line() +
  scale_x_continuous(breaks = 2:30) +
  theme_minimal() +
  labs(
    title = "Silhouette Score vs. Number of Clusters",
    x = "Number of Clusters (k)",
    y = "Average Silhouette Score"
  )
print(sil_plot)

Here, PCA, t-SNE, and UMAP techniques are applied for dimensionality reduction. The results are prepared for subsequent visualization.
# PCA
# Drop zero-variance columns first: prcomp(scale. = TRUE) fails on them.
non_constant_data_matrix <- data_matrix[, apply(data_matrix, 2, var) != 0]
pca_result_2d <- prcomp(non_constant_data_matrix, scale. = TRUE, center = TRUE)
pca_2d <- pca_result_2d$x[, 1:2] # first two principal-component scores
pca_3d <- pca_result_2d$x[, 1:3] # first three, for the 3-D plot
# t-sne
# NOTE(review): later code references tsne_3d, umap_result_2d and
# umap_result_3d, but their construction is not visible in this source (the
# rendered output does show a second, dims = 3 t-SNE run) — confirm those
# lines were not lost during extraction.
tsne_2d <- Rtsne(data_matrix, dims = 2, perplexity=20, verbose=TRUE) ## Performing PCA
## Read the 63 x 12 data matrix successfully!
## Using no_dims = 2, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.979592)!
## Learning embedding...
## Iteration 50: error is 47.812786 (50 iterations in 0.00 seconds)
## Iteration 100: error is 54.407321 (50 iterations in 0.00 seconds)
## Iteration 150: error is 47.916957 (50 iterations in 0.00 seconds)
## Iteration 200: error is 52.413016 (50 iterations in 0.00 seconds)
## Iteration 250: error is 51.839231 (50 iterations in 0.00 seconds)
## Iteration 300: error is 1.493987 (50 iterations in 0.00 seconds)
## Iteration 350: error is 0.925483 (50 iterations in 0.00 seconds)
## Iteration 400: error is 0.468466 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.326026 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.234702 (50 iterations in 0.00 seconds)
## Iteration 550: error is 0.216714 (50 iterations in 0.00 seconds)
## Iteration 600: error is 0.168604 (50 iterations in 0.00 seconds)
## Iteration 650: error is 0.152610 (50 iterations in 0.00 seconds)
## Iteration 700: error is 0.146334 (50 iterations in 0.00 seconds)
## Iteration 750: error is 0.136861 (50 iterations in 0.00 seconds)
## Iteration 800: error is 0.137959 (50 iterations in 0.00 seconds)
## Iteration 850: error is 0.140215 (50 iterations in 0.00 seconds)
## Iteration 900: error is 0.139350 (50 iterations in 0.00 seconds)
## Iteration 950: error is 0.138507 (50 iterations in 0.00 seconds)
## Iteration 1000: error is 0.139166 (50 iterations in 0.00 seconds)
## Fitting performed in 0.05 seconds.
## Performing PCA
## Read the 63 x 12 data matrix successfully!
## Using no_dims = 3, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.979592)!
## Learning embedding...
## Iteration 50: error is 46.745493 (50 iterations in 0.00 seconds)
## Iteration 100: error is 48.739883 (50 iterations in 0.00 seconds)
## Iteration 150: error is 47.681479 (50 iterations in 0.00 seconds)
## Iteration 200: error is 49.399400 (50 iterations in 0.00 seconds)
## Iteration 250: error is 47.451295 (50 iterations in 0.00 seconds)
## Iteration 300: error is 0.857982 (50 iterations in 0.00 seconds)
## Iteration 350: error is 0.287598 (50 iterations in 0.00 seconds)
## Iteration 400: error is 0.156503 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.125532 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.117372 (50 iterations in 0.00 seconds)
## Iteration 550: error is 0.113466 (50 iterations in 0.00 seconds)
## Iteration 600: error is 0.109056 (50 iterations in 0.00 seconds)
## Iteration 650: error is 0.107365 (50 iterations in 0.00 seconds)
## Iteration 700: error is 0.105148 (50 iterations in 0.00 seconds)
## Iteration 750: error is 0.104121 (50 iterations in 0.00 seconds)
## Iteration 800: error is 0.105026 (50 iterations in 0.00 seconds)
## Iteration 850: error is 0.104542 (50 iterations in 0.00 seconds)
## Iteration 900: error is 0.104130 (50 iterations in 0.00 seconds)
## Iteration 950: error is 0.103545 (50 iterations in 0.00 seconds)
## Iteration 1000: error is 0.105916 (50 iterations in 0.00 seconds)
## Fitting performed in 0.07 seconds.
# PCA 2-D scatter, colored by k-means cluster assignment.
pca_2d_df <- as.data.frame(pca_2d) # PCA 2d
pca_2d_df$cluster <- as.factor(clusters$cluster)
ggplot(pca_2d_df, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point() + theme_minimal() +
  ggtitle("PCA 2D Visualization with Cluster Coloring")

# t-SNE 2-D scatter.
# NOTE(review): tsne_3d, umap_result_2d and umap_result_3d are used below but
# not constructed in the visible source — confirm they are created earlier.
data_2d <- as.data.frame(tsne_2d$Y) # t-sne 2d
data_2d$cluster <- as.factor(clusters$cluster) # add cluster result
ggplot(data_2d, aes(x = V1, y = V2, color = cluster)) +
  geom_point() + theme_minimal() +
  ggtitle("t-SNE 2D Visualization with Cluster Coloring")

# t-SNE 3-D coordinates (the original assigned this twice back-to-back — an
# extraction artifact; the duplicate was idempotent and has been removed).
data_3d <- as.data.frame(tsne_3d$Y)
data_3d$cluster <- as.factor(clusters$cluster)

# UMAP 2-D scatter.
data_2d <- data.frame(X = umap_result_2d$layout[, 1],
                      Y = umap_result_2d$layout[, 2]) # umap 2d
data_2d$cluster <- as.factor(clusters$cluster)
ggplot(data_2d, aes(x = X, y = Y, color = cluster)) +
  geom_point() +
  theme_minimal() +
  ggtitle("UMAP 2D Visualization with Cluster Coloring")

# UMAP 3-D interactive scatter via plotly.
data_3d <- data.frame(X = umap_result_3d$layout[, 1],
                      Y = umap_result_3d$layout[, 2],
                      Z = umap_result_3d$layout[, 3])
data_3d$cluster <- as.factor(clusters$cluster)
fig <- plot_ly(data_3d, x = ~X, y = ~Y, z = ~Z, color = ~cluster,
               colors = RColorBrewer::brewer.pal(length(unique(data_3d$cluster)), "Dark2"),
               type = 'scatter3d', mode = 'markers')
fig

This code performs feature selection by conducting pairwise t-tests between clusters for each feature in the data matrix, adjusting the p-values using the Benjamini-Hochberg method to identify statistically significant features. It creates a summary table to display significant features and their adjusted p-values.
# Pairwise Welch t-tests between every pair of clusters, for each feature.
# results_ttest[[feature]][["j_k"]] holds the p-value for cluster pair (j, k),
# or NA when either group has zero variance (t.test would error).
results_ttest <- list()
feature_names <- colnames(data_matrix)
unique_clusters <- unique(clusters$cluster)
n_clusters <- length(unique_clusters)
for (i in seq_len(ncol(data_matrix))) {      # seq_len: safe when ncol == 0
  feature_results <- list()
  if (n_clusters >= 2) {                     # guard: 1:(n-1) loops c(1, 0) when n == 1
    for (j in seq_len(n_clusters - 1)) {
      for (k in (j + 1):n_clusters) {
        group1 <- data_matrix[clusters$cluster == unique_clusters[j], i]
        group2 <- data_matrix[clusters$cluster == unique_clusters[k], i]
        pair_id <- paste(unique_clusters[j], unique_clusters[k], sep = "_")
        if (var(group1) != 0 && var(group2) != 0) {
          feature_results[[pair_id]] <- t.test(group1, group2)$p.value
        } else {
          feature_results[[pair_id]] <- NA
        }
      }
    }
  }
  results_ttest[[feature_names[i]]] <- feature_results
}
# Collapse each feature's pairwise p-values to its single smallest value.
# vapply (not sapply) guarantees a numeric(1) result per feature regardless
# of input shape. Features where every pair was skipped (constant groups)
# become NA — this also avoids the "no non-missing arguments to min"
# warnings the original emitted.
all_p_values <- vapply(results_ttest, function(feature) {
  p <- unlist(feature)
  if (all(is.na(p))) {
    return(NA_real_)
  }
  min(p, na.rm = TRUE)
}, numeric(1))
# Benjamini-Hochberg correction across features, then keep FDR < 0.05.
p_adjusted <- p.adjust(all_p_values, method = "BH")
significant_features <- which(p_adjusted < 0.05)
summary_table <- data.frame(
  Feature = feature_names[significant_features],
  P_Value = p_adjusted[significant_features]
)
print(summary_table)
## Feature
## peak_expiratory_flow_pef_f3064_2_1 peak_expiratory_flow_pef_f3064_2_1
## triplet_entered_left_f4236_0_8 triplet_entered_left_f4236_0_8
## ecg_stage_name_f5988_1_27 ecg_stage_name_f5988_1_27
## total_peripheral_resistance_during_pwa_f12685_2_0 total_peripheral_resistance_during_pwa_f12685_2_0
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0 mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0 volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12 date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## P_Value
## peak_expiratory_flow_pef_f3064_2_1 1.450792e-05
## triplet_entered_left_f4236_0_8 4.876289e-09
## ecg_stage_name_f5988_1_27 1.442499e-20
## total_peripheral_resistance_during_pwa_f12685_2_0 1.367132e-08
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0 2.823986e-11
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0 8.229486e-12
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12 4.692706e-26
This script applies the Boruta algorithm to identify important features in the data matrix, visualizing their importance using box-and-whisker plots, and then confirms the selection of significant features using the TentativeRoughFix function.
library(Boruta)
library(plotly)
library(tidyr)
library(dplyr)

# Boruta feature selection: which features distinguish the k-means clusters?
cluster_numbers <- clusters$cluster
cluster_factors <- as.factor(cluster_numbers)
set.seed(123) # reproducible shadow-feature permutations
data_matrix_df <- as.data.frame(data_matrix)
boruta_output <- Boruta(cluster_factors ~ ., data=data_matrix_df, doTrace = 0)
library(plotly)
# Base-graphics alternative, kept for reference:
# plot(als, xlab="", xaxt="n")
# lz<-lapply(1:ncol(als$ImpHistory), function(i)
# als$ImpHistory[is.finite(als$ImpHistory[, i]), i])
# names(lz)<-colnames(als$ImpHistory)
# lb<-sort(sapply(lz, median))
# axis(side=1, las=2, labels=names(lb), at=1:ncol(als$ImpHistory), cex.axis=0.5, font = 4)

# Long format: one row per (feature, importance sample) for the box plots.
df_long <- tidyr::gather(as.data.frame(boruta_output$ImpHistory), feature, measurement)
plot_ly(df_long, x=~feature, y = ~measurement, color = ~feature, type = "box") %>%
  layout(title="Box-and-whisker Plots across all features (UKBB Data)",
         xaxis = list(title="Features", categoryorder = "total descending"),
         # FALSE, not F: F is a reassignable binding, not a reserved word.
         yaxis = list(title="Importance"), showlegend=FALSE)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
## Boruta performed 21 iterations in 0.1533029 secs.
## 9 attributes confirmed important:
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12,
## ecg_stage_name_f5988_1_27,
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0,
## peak_expiratory_flow_pef_f3064_2_1,
## total_peripheral_resistance_during_pwa_f12685_2_0 and 4 more;
## 3 attributes confirmed unimportant:
## number_of_operations_selfreported_f136_0_0,
## urea_reportability_f30676_1_0, values_wanted_f23321_3_16;
## number_of_operations_selfreported_f136_0_0
## Rejected
## peak_expiratory_flow_pef_f3064_2_1
## Confirmed
## triplet_played_left_f4229_0_9
## Confirmed
## triplet_entered_left_f4236_0_8
## Confirmed
## ecg_stage_name_f5988_1_27
## Confirmed
## total_peripheral_resistance_during_pwa_f12685_2_0
## Confirmed
## values_wanted_f23321_3_16
## Rejected
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## Confirmed
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## Confirmed
## urea_reportability_f30676_1_0
## Rejected
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## Confirmed
## typical_diet_yesterday_f100020_4_0
## Confirmed
## Levels: Tentative Confirmed Rejected
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
## Call:
## knockoff.filter(X = data, y = as.numeric(cluster_factors), fdr = 0.8)
##
## Selected variables:
## number_of_operations_selfreported_f136_0_0
## 1
## peak_expiratory_flow_pef_f3064_2_1
## 2
## triplet_played_left_f4229_0_9
## 3
## ecg_stage_name_f5988_1_27
## 5
## total_peripheral_resistance_during_pwa_f12685_2_0
## 6
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## 7
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## 9
## typical_diet_yesterday_f100020_4_0
## 10